#Replication data for "The Promises and Pitfalls of 311 Data", Urban Affairs Review
#Ariel White and Kris-Stella Trump
#email Ariel (arwhi@mit.edu) with questions
#October 2016

## this file: pulling in the campaign donations data and prepping it to merge into the main dataset.

rm(list=ls())
#setwd("C:/Users/Ariel White/Dropbox (MIT)/final311repdata_forposting/donations")#windows computer filepath for testing. update to your own!

library(foreign)
library(data.table)
#pull in giant file.
test <- data.table(read.csv("contributions.nimsp.2010.csv", nrow=5000)) #downloaded from http://data.influenceexplorer.com/bulk/
keeptest <- test[contributor_state=="NY" & contributor_type=="I",]; dim(keeptest)

test1 <- data.table(read.csv("contributions.nimsp.2010.csv", nrow=5000, skip=1000000, header=F))
keeptest <- test1[V19=="NY" & V13=="I",]; dim(keeptest)

##okay, now loop through, pull it in in big chunks, keep just rows from NYS 
##this command in the terminal: wc -l contributions.nimsp.2010.csv
## tells me there are 4294029 lines in this file.

#eh, not sure it's memory-efficient to loop it and use assign or whatever, just copy the code
test1 <- data.table(read.csv("contributions.nimsp.2010.csv", nrow=900000, header=F))
keeptest1 <- test1[V19=="NY" & V13=="I",]; dim(keeptest1); rm(test1)
test2 <- data.table(read.csv("contributions.nimsp.2010.csv", nrow=900000, header=F, skip=900000))
keeptest2 <- test2[V19=="NY" & V13=="I",]; dim(keeptest2); rm(test2)
test3 <- data.table(read.csv("contributions.nimsp.2010.csv", nrow=900000, header=F, skip=(900000*2)))
keeptest3 <- test3[V19=="NY" & V13=="I",]; dim(keeptest3); rm(test3)
test4 <- data.table(read.csv("contributions.nimsp.2010.csv", nrow=900000, header=F, skip=(900000*3)))
keeptest4 <- test4[V19=="NY" & V13=="I",]; dim(keeptest4); rm(test4)
test5 <- data.table(read.csv("contributions.nimsp.2010.csv", nrow=900000, header=F, skip=(900000*4)))
keeptest5 <- test5[V19=="NY" & V13=="I",]; dim(keeptest5); 
tail(test5)
rm(test5)

allnysdonations <- as.data.frame(rbind(keeptest1, keeptest2, keeptest3, keeptest4, keeptest5))
dim(allnysdonations); nrow(keeptest1)+ nrow(keeptest2)+ nrow(keeptest3)+ nrow(keeptest4)+ nrow(keeptest5) 
colnames(allnysdonations) <- colnames(test)
#okay, save it.
#write.csv(allnysdonations, "allnysindividualdonations2010cycle.csv")
#and cut to just NYC
cities <- table(allnysdonations$contributor_city) #want to see how common typos are.
cities[10150:10850]
newyorks <- c("NEW YORK", "NEW YOK", "NEWYORK", "NEW YORK,", "NEW-YORK", "NEW YOUK", "NEW YOURK", "NEW YORK CITY", "NEW YORK, NY", "NEW YORKQ", "NEW YORY", "NEW YROK", "NE YORK", "NY", "NYC")
allnycdonations <- allnysdonations[allnysdonations$contributor_city %in% newyorks,]; dim(allnysdonations); dim(allnycdonations)

write.csv(allnycdonations, "allnycindividualdonations2010cycle.csv")

## next up, need to geocode these donations. 
## NOTE: don't run this code all the time b/c it takes forever.  Skip to the post-geocoding point below if possible.

rm(list=ls())
library(data.table)
library(foreign)

#borrowed some code from: http://www.shanelynn.ie/massive-geocoding-with-r-and-google-maps/
# Geocoding script for large list of addresses.# Shane Lynn 10/10/2013
 
#load up the ggmap library
library(ggmap)
# get the input data
infile <- "allnycindividualdonations2010cycle"
data <- read.csv(paste0('./', infile, '.csv')) #25468

straddresses = data$contributor_address
straddresses = sub("(.*?)#.*", "\\1", straddresses) #drop everything after #, since apt. numbers make it go awry.
cityaddresses = data$contributor_city
staddresses = data$contributor_state
zaddresses <- as.character(data$contributor_zip)
zaddresses[is.na(zaddresses)] <- "" #replace NA's with blanks.
addresses = paste0(straddresses, ", ",cityaddresses, ", ", staddresses,  ", ", zaddresses,", USA")
 
write.csv(addresses, "allnycinddonations_2010cycle_justaddressesforgeocoding.csv")
#define a function that will process googles server responses for us.
getGeoDetails <- function(address){   
   #use the gecode function to query google servers
   geo_reply = geocode(address, output='all', messaging=TRUE, override_limit=TRUE)
   #now extract the bits that we need from the returned list
   answer <- data.frame(lat=NA, long=NA, accuracy=NA, formatted_address=NA, address_type=NA, status=NA)
   answer$status <- geo_reply$status
 
   #if we are over the query limit - want to pause for an hour
   while(geo_reply$status == "OVER_QUERY_LIMIT"){
       print("OVER QUERY LIMIT - Pausing for 1 hour at:") 
       time <- Sys.time()
       print(as.character(time))
       Sys.sleep(60*60)
       geo_reply = geocode(address, output='all', messaging=TRUE, override_limit=TRUE)
       answer$status <- geo_reply$status
   }
   #return Na's if we didn't get a match:
   if (geo_reply$status != "OK"){
       return(answer)
   }   
   #else, extract what we need from the Google server reply into a dataframe:
   answer$lat <- geo_reply$results[[1]]$geometry$location$lat
   answer$long <- geo_reply$results[[1]]$geometry$location$lng   
   if (length(geo_reply$results[[1]]$types) > 0){
       answer$accuracy <- geo_reply$results[[1]]$types[[1]]
   }
   answer$address_type <- paste(geo_reply$results[[1]]$types, collapse=',')
   answer$formatted_address <- geo_reply$results[[1]]$formatted_address
   return(answer)
}

#initialise a dataframe to hold the results
geocoded <- data.frame()
# find out where to start in the address list (if the script was interrupted before):
startindex <- 1
#if a temp file exists - load it up and count the rows!
tempfilename <- paste0(infile, '_temp_2010geocoded_jan2016.rds')
if (file.exists(tempfilename)){
       print("Found temp file - resuming from index:")
       geocoded <- readRDS(tempfilename)
       startindex <- nrow(geocoded)
       print(startindex)
}
# Start the geocoding process - address by address. geocode() function takes care of query speed limit.
for (ii in seq(startindex, length(addresses))){
   print(paste("Working on index", ii, "of", length(addresses)))
   #query the google geocoder - this will pause here if we are over the limit.
   result = getGeoDetails(addresses[ii]) 
   print(result$status)     
   result$index <- ii
   #append the answer to the results file.
   ## ARW edit: sometimes it returns too small an object. if this happens, pad it out.
   result$formatted_address <- ifelse(is.null(result$formatted_address), "", result$formatted_address)
   result$accuracy <- ifelse(is.null(result$accuracy), "", result$accuracy)
   geocoded <- rbind(geocoded, result)
   #save temporary results as we are going along
   saveRDS(geocoded, tempfilename)
}

 #now we add the latitude and longitude to the main data
data$lat <- geocoded$lat
data$long <- geocoded$long
data$accuracy <- geocoded$accuracy
 
#finally write it all to the output files
saveRDS(data, paste0(infile ,"_2010geocoded_jan2016.rds"))
write.table(data, file=paste0(infile ,"_2010geocoded_jan2016.csv"), sep=",", row.names=FALSE)


########################################
#can also run code from here (don't re-run geocoding all the time); just make sure you set the right WD.

rm(list=ls())
library(data.table)
library(foreign)

infile <- "allnycindividualdonations2010cycle"
data <- read.table(file=paste0(infile ,"_2010geocoded_jan2016.csv"), sep=",", header=T) #load in the geocoded version of the data from above.

##want to go through, fix weirdly formed addresses (automatically or manually), and redo geocoding for just those.

library(maps)
library(maptools)
library(classInt)
library(gpclib)
library(mapdata)

#quick check for weirdness
problems <- data[( data$long<(-74.3)| data$long>(-73.6) | data$lat<40.3 | data$lat>40.9 | is.na(data$long)==T) | !(data$accuracy %in% c("street_address", "premise", "subpremise")), ]
dim(problems); dim(data) 

head(problems); tail(problems); as.character(problems$contributor_address[100:150]); as.character(problems$contributor_address[210:250])
## it looks like a bunch of these are blank, but others are malformed.
## could automatically fix some by dropping anything after "PH" or "PENTHOUSE" or "UNREADABLE" or PMB or APT or ST, while fixing spacing on others by hand.
## and some might have the wrong fields entered for address? or just weird stuff in here.
## wait on this for now. it's about 1000 out of 25000.
## okay, so what I'm going to do (2/8/2016) is:
## 1.) ignore unusable addresses (blank/unreadable)
## 2.) then automatically fix as many as possible of the remainder as discussed above
## 3.) then output the whole list to a txt file, skim it and hand-correct weird stuff I see. 
## 4.) then load it back in, and try re-geocoding it.

test <- as.character(problems$contributor_address[210:250]) #play around with fixes here.
testadds = sub("(.*?) PH.*", "\\1", test)
testadds1 = sub("(.*?) PENTHOUSE.*", "\\1", testadds)

#1.) how many are just missing addresses?
missingproblemadds <- problems[problems$contributor_address=="",]; dim(missingproblemadds) #289.  nothing to be done with these.

#2.) fix address formatting
straddresses = problems$contributor_address
straddresses = sub("(.*?) PH.*", "\\1", straddresses) 
straddresses = sub("(.*?) PENTHOUSE.*", "\\1", straddresses) 

#3.) go fix street address stuff by hand
library(foreign)
write.csv(straddresses, "donationaddresses_problemsforhandcorrection_feb2016.txt", row.names=F)
#then go fix it, save as new filename
#note I fixed all the obviously fixable but did nothing about weird stuff (employer name, C/O some business, completely unintelligible things)
straddresses <- read.csv("donationaddresses_problemsforhandcorrection_feb2016_fixed.txt",blank.lines.skip=F)

#4.) put it all back together and re-geocode.
cityaddresses = problems$contributor_city
staddresses = problems$contributor_state
zaddresses <- as.character(problems$contributor_zip)
zaddresses[is.na(zaddresses)] <- "" #replace NA's with blanks.
problemaddresses <- paste0(straddresses[,1], ", ",cityaddresses, ", ", staddresses,  ", ", zaddresses,", USA")

length(problemaddresses)
library(ggmap)

## see code in the top chunk to set up this function.
getGeoDetails <- function(address){   
   #use the gecode function to query google servers
   geo_reply = geocode(address, output='all', messaging=TRUE, override_limit=TRUE)
   #now extract the bits that we need from the returned list
   answer <- data.frame(lat=NA, long=NA, accuracy=NA, formatted_address=NA, address_type=NA, status=NA)
   answer$status <- geo_reply$status
 
   #if we are over the query limit - want to pause for an hour
   while(geo_reply$status == "OVER_QUERY_LIMIT"){
       print("OVER QUERY LIMIT - Pausing for 1 hour at:") 
       time <- Sys.time()
       print(as.character(time))
       Sys.sleep(60*60)
       geo_reply = geocode(address, output='all', messaging=TRUE, override_limit=TRUE)
       answer$status <- geo_reply$status
   }
   #return Na's if we didn't get a match:
   if (geo_reply$status != "OK"){
       return(answer)
   }   
   #else, extract what we need from the Google server reply into a dataframe:
   answer$lat <- geo_reply$results[[1]]$geometry$location$lat
   answer$long <- geo_reply$results[[1]]$geometry$location$lng   
   if (length(geo_reply$results[[1]]$types) > 0){
       answer$accuracy <- geo_reply$results[[1]]$types[[1]]
   }
   answer$address_type <- paste(geo_reply$results[[1]]$types, collapse=',')
   answer$formatted_address <- geo_reply$results[[1]]$formatted_address
   return(answer)
}


problemadds1 <- data.frame()
# find out where to start in the address list (if the script was interrupted before):
startindex <- 1

# Start the geocoding process - address by address. geocode() function takes care of query speed limit.
for (ii in seq(1, length(problemaddresses))){
   print(paste("Working on index", ii, "of", length(problemaddresses)))
   #query the google geocoder - this will pause here if we are over the limit.
   result = getGeoDetails(problemaddresses[ii]) 
   print(result$status)     
   result$index <- ii
   #append the answer to the results file.
   ## ARW edit: sometimes it returns too small an object. if this happens, pad it out.
   result$formatted_address <- ifelse(is.null(result$formatted_address), "", result$formatted_address)
   result$accuracy <- ifelse(is.null(result$accuracy), "", result$accuracy)
   problemadds1 <- rbind(problemadds1, result)
   #save temporary results as we are going along
   #saveRDS(geocoded, tempfilename)
}


#stick this to the other data. ###
dim(data); dim(problemadds1)
#how best to do this? need to replace codings for the problem rows, not just tack them on.
#maybe up top should save the indices for the ones I'm pulling out?  then easy to drop them/reattach? just need to make sure we've got the same # columns.
#or just find the differences & stick those together.

fun.12 <- function(x.1,x.2,...){ #from here: http://www.r-bloggers.com/identifying-records-in-data-frame-a-that-are-not-contained-in-data-frame-b-%E2%80%93-a-comparison/
     x.1p <- do.call("paste", x.1)
     x.2p <- do.call("paste", x.2)
     x.1[! x.1p %in% x.2p, ]}
nonproblems <- fun.12(data, problems)
dim(nonproblems); dim(problemadds1)

#trim down and stick them together
problems$lat <- problemadds1$lat
problems$long <- problemadds1$long
problems$accuracy <- problemadds1$accuracy
dim(problems); dim(nonproblems)

betterdata <- rbind(nonproblems, problems)
dim(betterdata)
#maybe save this as something else so we won't lose the ability to see old results.
names(betterdata)[names(betterdata)=="long"] <- "lon"

### save this (so as not to rerun the google geocoding either), and restart the code AGAIN.
write.table(betterdata, file="fulldata_geocoded_march2016.csv", sep=",", row.names=FALSE)

## look at continued problems
stillproblems <- betterdata[( betterdata$lon<(-74.3)| betterdata$lon>(-73.6) | betterdata$lat<40.3 | betterdata$lat>40.9) , ] #including NA's, which are also problems.
dim(stillproblems)
head(stillproblems) #what is this?
#anyway, not nearly as many as before, awesome.
sum(is.na(betterdata$id)) #none of this business.


#######################################################################
## run from here (don't need to run above code.)
#######################################################################

rm(list=ls())
library(data.table)
#setwd("") #need to set this if starting code here
library(foreign)
fulldata <- read.table(file="fulldata_geocoded_march2016.csv", sep=",", header=T)


#(making some maps for validation)
## try based on this instead: http://stackoverflow.com/questions/11648870/offline-plotting-of-map-coordinates-on-static-maps-of-google

#points <- fulldata[is.na(fulldata$lat)==F & is.na(fulldata$lon)==F,]; dim(points); dim(fulldata)
#example <- points[2,]
#mapImageData <- get_googlemap(c(lon = example$lon, lat= example$lat), zoom=17)
#example$label1 <- paste(example$contributor_address,example$lat, example$lon, sep=", " )
## Plot
#ggmap(mapImageData) +
#    geom_point(aes(x=lon, y=lat), data=example, colour="red", size=5) +
#    geom_text(data = example, aes(x = lon, y = lat, label = label1), size = 4, vjust = 0, hjust = -0.15)
##okay,now just need to add some text onto this and loop it over many rows.


#set.seed(02138)
#samplerows <- points[sample(nrow(points), 50), ]  #pull a 50-row sample of addresses to plot.  note this still omits those with missing lat/long.

#testIL <- samplerows[19,] #here's one I know comes up wrong.  but why?
#testILadd <- paste(testIL$contributor_address, testIL$contributor_city, testIL$contributor_state, testIL$contributor_zip, "USA", sep=",")
#tester <- geocode(testILadd, output = "latlon", #warning: this takes a while too.
#	source = "google", messaging = TRUE, sensor = FALSE, override_limit = FALSE, client = "", signature = "", nameType = c("long", "short"))
##why??? It's the #2J.  If I drop this, it works. try to fix this above.


#setwd("./validationmaps2016") #stick these all in a folder.
#for (i in 1:nrow(samplerows)){
#	name = paste("campaignfinance_testmaps_", as.character(i),".pdf", sep="") #generate filename
#	example <- samplerows[i,]
#	mapImageData <- get_googlemap(c(lon = example$lon, lat= example$lat), zoom=17) #pull map
#	 #plot point, add text
#	example$label1 <- paste(example$contributor_address,example$lat, example$lon, sep=", " )
#	testmap <- ggmap(mapImageData) +
#    		geom_point(aes(x=lon, y=lat), data=example, colour="red", size=5) +
#    		geom_text(data = example, aes(x = lon, y = lat, label = label1), size = 4, vjust = 0, hjust = -0.15)
#	ggsave(file=name) #save map
#}



## okay, so we should go through and validate this.  Once we've decided it's good, we can collapse this data to tracts and compare to 311 data.

# for that, borrowing code from the NYC311_data_prep_merge_code script
#Load packages for geographic data analysis
library(maps)
library(maptools)
library(sp)
library(rgdal)

#Read in tract shapefiles from the Census Bureau https://www.census.gov/geo/maps-data/data/cbf/cbf_tracts.html accessed 17 January 2014.
#extract current working directory
currwd <- getwd()
#setwd(paste(currwd,"/NYtractshapefiles",sep="")) #this was under old setup
setwd("../NYtractshapefiles")
NYCtracts <- readOGR(dsn=".", layer = "tl_2010_36_tract10")

setwd(paste(currwd))
summary(NYCtracts); proj4string(NYCtracts)

fulldata2 <- fulldata1 <- subset(fulldata, (is.na(fulldata$lat)==F & is.na(fulldata$lon)==F))
dim(fulldata); dim(fulldata1)

#Full sample
coordinates(fulldata1) <- c("lon", "lat")
proj4string(fulldata1) <- CRS("+proj=longlat")
nysampleT1 <- spTransform(fulldata1, CRS(proj4string(NYCtracts)))

class(nysampleT1); class(NYCtracts)
join1 <- over(nysampleT1, NYCtracts)
dim(join1); head(join1)
class(join1); class(fulldata1)

NYCdonationsfull1 <- data.table(cbind(fulldata2, join1)) 
#okay, now clean up some fields, do some summary stats, and then go figure out how stick them to 311 data.
summary(NYCdonationsfull1$amount) #what?
neg <- NYCdonationsfull1[NYCdonationsfull1$amount <0,]
#I'm going to guess these are errors? Unless there's some sort of returning-donations situation.
NYCdonationsfull1$dollaramount<- abs(NYCdonationsfull1$amount)
summary(NYCdonationsfull1$dollaramount) #some of these lower values are likely errors? also, some missingness.

#now collapse to tract-level counts of donations (for now, no time restrictions)
#coming back now and wanting to do some sort of time restriction: 6mos before election? 3mos? all 2012?
NYCdonationsfull1[,donation := 1]

head(as.character(NYCdonationsfull1$date))
NYCdonationsfull1$donationdate <- (as.Date(as.character(NYCdonationsfull1$date)))
NYCdonationsfull1[, year2010_donation:=0]
NYCdonationsfull1[donationdate > "2009-12-31" , year2010_donation:=1]
dim(NYCdonationsfull1); sum(NYCdonationsfull1$year2010_donation) #about half.

NYCdonationsfull1[, election10_6mosprec_donation:=0]
NYCdonationsfull1[donationdate > "2010-05-02"  & donationdate < "2010-11-03" , election10_6mosprec_donation:=1]
dim(NYCdonationsfull1); sum(NYCdonationsfull1$election10_6mosprec_donation) 

NYCdonationsfull1[, election10_3mosprec_donation:=0]
NYCdonationsfull1[donationdate > "2010-08-02"  & donationdate < "2010-11-03" , election10_3mosprec_donation:=1]
dim(NYCdonationsfull1); sum(NYCdonationsfull1$election10_3mosprec_donation) 

NYCdonationsfull1[, election10_3mosprec_donation_dollars:=0]
NYCdonationsfull1[donationdate > "2010-08-02"  & donationdate < "2010-11-03" , election10_3mosprec_donation_dollars:=dollaramount]
dim(NYCdonationsfull1); sum(NYCdonationsfull1$election10_3mosprec_donation_dollars) 

NYCdonationsfull1[, election10_6mosprec_donation_dollars:=0]
NYCdonationsfull1[donationdate > "2010-05-02"  & donationdate < "2010-11-03" , election10_6mosprec_donation_dollars:=dollaramount]
sum(NYCdonationsfull1$dollaramount, na.rm=T); sum(NYCdonationsfull1$election10_6mosprec_donation_dollars) 

tractdonations <- NYCdonationsfull1[, list(donations2010_dollartotal = sum(dollaramount, na.rm=T) , donations2010_6mospreelection_dollartotal = sum(election10_6mosprec_donation_dollars, na.rm=T), donations2010_3mospreelection_dollartotal = sum(election10_3mosprec_donation_dollars, na.rm=T) , donations2010_count = sum(donation, na.rm=T), donations2010_year2012only_count = sum(year2010_donation, na.rm=T), donations2010_3mospreelection_count = sum(election10_3mosprec_donation, na.rm=T), donations2010_6mospreelection_count = sum(election10_6mosprec_donation, na.rm=T)) , by= list(GEOID10)]

dim(tractdonations)
head(tractdonations)
summary(tractdonations)


save(NYCdonationsfull1, file="NYCdonations2010_geocoded_tractIDS.Rdata")
save(tractdonations, file="NYCdonations2010_tractlevel.Rdata")


